Code
# Load the packages used throughout the document. Wrapping require() in
# stopifnot() turns its FALSE return value into a hard error if any
# package is missing.
stopifnot(
require(tidyverse),
require(patchwork),
require(httr),
require(glue),
require(broom)
)
old_theme <- theme_set(theme_minimal())
Use the Environment panel (RStudio) to explore the structure of lm_1. Try to understand the meaning of each element.
# Download the dataset once; skip the download if the file is already cached.
datapath <- '../DATA'
fname <- 'Banque.csv'
fpath <- paste(datapath, fname, sep = "/")
if (!file.exists(fpath)) {
  baseurl <- 'https://stephane-v-boucheron.fr/data'
  download.file(url = paste(baseurl, fname, sep = "/"),
                destfile = fpath)
  print(glue::glue('File {fname} downloaded at {fpath}!'))
} else {
  print(glue::glue('File {fname} already exists at {fpath}!'))
}
File Banque.csv already exists at ../DATA/Banque.csv!
# Read the dataset with explicit column types: binary satisfaction items
# become factors, counts become integers.
bank <- readr::read_table(
  fpath,
  col_types = cols(
    SEXE = col_factor(levels = c("0", "1")),
    CATEGORIE = col_integer(),
    NB_ETUDES = col_integer(),
    SATIS_EMPLOI = col_factor(levels = c("non", "oui")),
    SATIS_CHEF = col_factor(levels = c("non", "oui")),
    SATIS_SALAIRE = col_factor(levels = c("non", "oui")),
    SATIS_COLLEGUES = col_factor(levels = c("non", "oui")),
    SATIS_CE = col_factor(levels = c("non", "oui"))
  )
)
# Simple regression: current salary explained by hiring salary.
lm_1 <- lm(formula = SAL_ACTUEL ~ SAL_EMBAUCHE, data = bank)
# magrittr functional sequence: fitted lm -> formula -> character string.
lm2str_frm <- . %>%
  formula() %>%
  deparse()
frm_1 <- lm2str_frm(lm_1)
summary(lm_1)
Call:
lm(formula = SAL_ACTUEL ~ SAL_EMBAUCHE, data = bank)
Residuals:
Min 1Q Median 3Q Max
-35424 -4031 -1154 2584 49293
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.928e+03 8.887e+02 2.17 0.0305 *
SAL_EMBAUCHE 1.909e+00 4.741e-02 40.28 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8115 on 472 degrees of freedom
Multiple R-squared: 0.7746, Adjusted R-squared: 0.7741
F-statistic: 1622 on 1 and 472 DF, p-value: < 2.2e-16
cor(lm_1$fitted.values, bank$SAL_ACTUEL)^2
[1] 0.7746068
var(lm_1$fitted.values)/var(bank$SAL_ACTUEL)
[1] 0.7746068
broom::tidy()
# Coefficient table of lm_1; `digits` spelled out (the original relied on
# partial argument matching via `digit=`).
lm_1 %>%
  tidy() %>%
  knitr::kable(digits = 2, caption = frm_1)
tidy() is a generic function that can be applied to very different classes of objects
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 1928.21 | 888.68 | 2.17 | 0.03 |
| SAL_EMBAUCHE | 1.91 | 0.05 | 40.28 | 0.00 |
broom::glance()
# One-row model summary of lm_1; `digits` spelled out (the original relied
# on partial argument matching via `digit=`).
lm_1 %>%
  glance() %>%
  knitr::kable(digits = 2, caption = frm_1)
glance() is also a generic function
| r.squared | adj.r.squared | sigma | statistic | p.value | df | logLik | AIC | BIC | deviance | df.residual | nobs |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0.77 | 0.77 | 8115.36 | 1622.12 | 0 | 1 | -4938.29 | 9882.58 | 9895.07 | 31085446686 | 472 | 474 |
broom::augment()
# Attach per-observation diagnostics (.fitted, .resid, .hat, ...) to bank.
lm_1_aug <- lm_1 %>%
  augment(data = bank)
lm_1_aug %>%
  DT::datatable(extensions = "Responsive")
lm_1 is a list with many named components
The output of augment() is a dataframe built from information gathered in lm_1 and in bank.
The output of augment() may be described as adding six columns to the dataframe bank. The six columns are built using items from lm_1. Can you explain their meaning and why they are relevant to diagnostics?
# Show the six diagnostic columns added by augment() (all start with ".").
lm_1_aug %>%
  select(starts_with(".")) %>%
  head() %>%
  knitr::kable(digits = 2, caption = frm_1)
| .fitted | .resid | .hat | .sigma | .cooksd | .std.resid |
|---|---|---|---|---|---|
| 21404.59 | -5654.59 | 0 | 8119.77 | 0 | -0.70 |
| 21404.59 | -5504.59 | 0 | 8119.99 | 0 | -0.68 |
| 20545.34 | -4345.34 | 0 | 8121.49 | 0 | -0.54 |
| 21404.59 | -5204.59 | 0 | 8120.41 | 0 | -0.64 |
| 21404.59 | -5204.59 | 0 | 8120.41 | 0 | -0.64 |
| 21404.59 | -5054.59 | 0 | 8120.61 | 0 | -0.62 |
Let base R produce diagnostic plots
plot(lm_1, which = 1:6)
We will reproduce (and discuss) four of the six diagnostic plots provided by the plot method from base R (1, 2, 3, 5).
ggplot using the augmented version of lm_1 (augment(lm_1)).
# Diagnostic plot 1: residuals versus fitted values, with a loess trend.
# Fixes: stray ")" inside the y-axis label; se=FALSE spelled out.
p_1_lm_1 <- lm_1_aug %>%
  ggplot() +
  aes(x = .fitted, y = .resid) +
  geom_point(alpha = .5, size = .5) +
  geom_smooth(formula = y ~ x,
              method = "loess",
              se = FALSE,
              linetype = "dotted",
              linewidth = .5,
              color = "black") +
  xlab("Fitted values") +
  ylab("Residuals") +
  ggtitle("Bank dataset",
          subtitle = frm_1) +
  labs(caption = "Residuals versus Fitted")
Comment on Diagnostic Plot 1.
Compute the correlation coefficient between residuals and fitted values.
Make your graphic pipeline a reusable function.
# Reusable diagnostic plot 1 (residuals versus fitted) for any lm object.
# Fixes: `size=` on a line geom is deprecated since ggplot2 3.4.0 (the
# rendered warning below documented it) -> `linewidth=`; stray ")" in the
# y-axis label; se=FALSE spelled out.
make_p_diag_1 <- function(lm.){
  augment(lm.) %>%
    ggplot() +
    aes(x = .fitted, y = .resid) +
    geom_point(alpha = .5, size = .5) +
    geom_smooth(method = "loess",
                formula = y ~ x,
                se = FALSE,
                linetype = "dotted",
                linewidth = .5,
                color = "black") +
    xlab("Fitted values") +
    ylab("Residuals") +
    labs(title = "Residuals versus Fitted")
}
ggplot.
# Diagnostic plot 3 (scale-location): sqrt(|standardized residuals|)
# against fitted values; a flat trend suggests homoscedasticity.
p_3_lm_1 <- lm_1_aug %>%
  ggplot() +
  aes(x = .fitted, y = sqrt(abs(.std.resid))) +
  geom_smooth(formula = y ~ x,
              se = FALSE,
              method = "loess",
              linetype = "dotted",
              linewidth = .5,
              color = "black") +
  xlab("Fitted values") +
  ylab("sqrt(standardized residuals)") +
  geom_point(size = .5, alpha = .5) +
  ggtitle("Bank dataset",
          subtitle = frm_1) +
  labs(caption = "Scale location")
Make your graphic pipeline a reusable function.
# Reusable scale-location plot (diagnostic plot 3) for any lm object.
# Fixes: deprecated `size=` on the smooth -> `linewidth=`; se=FALSE.
make_p_diag_3 <- function(lm.){
  augment(lm.) %>%
    ggplot() +
    aes(x = .fitted, y = sqrt(abs(.std.resid))) +
    geom_smooth(formula = y ~ x,
                method = "loess",
                se = FALSE,
                linetype = "dotted",
                linewidth = .5,
                color = "black") +
    xlab("Fitted values") +
    ylab("sqrt(standardized residuals)") +
    geom_point(size = .5, alpha = .5) +
    labs(title = "Scale location")
}
ggplot.
# Diagnostic plot 5: standardized residuals against leverage for lm_1.
p_5_lm_1 <- lm_1_aug %>%
  ggplot() +
  aes(x = .hat, y = .std.resid) +
  geom_point(size = .5, alpha = .5) +
  xlab("Leverage") +
  ylab("Standardized residuals") +
  ggtitle("Bank dataset",
          subtitle = frm_1)
# plot(lm.1, which = 5)
# Reusable residuals-versus-leverage plot (diagnostic plot 5).
# Fix: typo "residulas" in the title.
make_p_diag_5 <- function(lm.){
  augment(lm.) %>%
    ggplot() +
    aes(x = .hat, y = .std.resid) +
    geom_point(size = .5, alpha = .5) +
    xlab("Leverage") +
    ylab("Standardized residuals") +
    labs(title = "Standardized residuals versus Leverages")
}
In the second diagnostic plot (the residuals qqplot), we build a quantile-quantile plot by plotting the function \(F_n^{\leftarrow} \circ \Phi\), where \(\Phi\) is the CDF of the standard Gaussian distribution and \(F_n^{\leftarrow}\) is the empirical quantile function of the residuals.
ggplot.
# Diagnostic plot 2: normal quantile-quantile plot of the residuals.
# Fix: deprecated `size=` on stat_qq_line -> `linewidth=`.
p_2_lm_1 <- lm_1_aug %>%
  ggplot() +
  aes(sample = .resid) +
  geom_qq(size = .5, alpha = .5) +
  stat_qq_line(linetype = "dotted",
               linewidth = .5,
               color = "black") +
  ggtitle("Bank dataset",
          subtitle = frm_1) +
  labs(caption = "Residuals qqplot") +
  xlab("Theoretical quantiles") +
  ylab("Empirical quantiles of residuals")
# plot(lm_1, which = 2)
# Reusable residuals qqplot (diagnostic plot 2) for any lm object.
make_p_diag_2 <- function(lm.){
  augment(lm.) %>%
    ggplot() +
    aes(sample = .resid) +
    geom_qq(size = .5, alpha = .5) +
    stat_qq_line(linetype = "dotted",
                 linewidth = .5,
                 color = "black") +
    labs(title = "Residuals qqplot")
}
Use patchwork::... to collect your four diagnostic plots.
lyt <- patchwork::plot_layout(ncol = 2, nrow = 2)
# Assemble the four diagnostic plots for lm_1.
# DRY: reuse frm_1 ("SAL_ACTUEL ~ SAL_EMBAUCHE") rather than repeating the
# formula string in the caption.
(make_p_diag_1(lm_1) +
   make_p_diag_2(lm_1) +
   make_p_diag_3(lm_1) +
   make_p_diag_5(lm_1)) +
  patchwork::plot_annotation(caption = frm_1)
p_1_lm_1 + p_2_lm_1 + p_3_lm_1 + p_5_lm_1
SAL_ACTUEL
# Observed response versus fitted values, with a loess trend.
# Fixes: deprecated `size=` on the smooth -> `linewidth=`; se=FALSE.
p_1_bis_lm_1 <- lm_1_aug %>%
  ggplot() +
  aes(x = .fitted, y = SAL_ACTUEL) +
  geom_point(alpha = .5, size = .5) +
  geom_smooth(formula = y ~ x,
              method = "loess",
              se = FALSE,
              linetype = "dotted",
              linewidth = .5,
              color = "black") +
  xlab("Fitted values") +
  ylab("SAL_ACTUEL") +
  ggtitle("Bank dataset",
          subtitle = frm_1) +
  labs(caption = "SAL_ACTUEL versus Fitted")
p_1_bis_lm_1
AGE and SAL_ACTUEL
Redo the above described steps and call the model lm_2.
# Second model: current salary explained by age.
lm_2 <- lm(SAL_ACTUEL ~ AGE, data = bank)
lm_2 %>%
  tidy()
# A tibble: 2 × 5
  term        estimate std.error statistic  p.value
  <chr>          <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)   42272.     2571.      16.4 2.30e-48
2 AGE            -211.      65.8     -3.20 1.45e- 3
lyt <- patchwork::plot_layout(ncol = 2, nrow = 2)
make_p_diag_1(lm_2) +
  make_p_diag_2(lm_2) +
  make_p_diag_3(lm_2) +
  make_p_diag_5(lm_2)
ggplot programming: write a function with arguments df, varx and vary, where varx and vary are two strings denoting numerical columns in df, that outputs a ggplot object made of a scatterplot of columns varx and vary, and a linear regression of vary against varx. The ggplot object should be annotated with the linear correlation coefficient of vary and varx and equipped with a title.
# Hard-coded version for AGE vs SAL_ACTUEL.
# Fixes: stray trailing comma in geom_point(); se=FALSE spelled out.
bank %>%
  ggplot() +
  aes(x = AGE, y = SAL_ACTUEL) +
  geom_point(alpha = .5, size = .5) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  annotate(geom = "text", x = 60, y = 60000,
           label = str_c("rho = ",
                         round(cor(bank$SAL_ACTUEL, bank$AGE), 2))) +
  ggtitle("Bank dataset")
# Generic version: scatterplot of column `vary` against column `varx`
# (both given as strings), with a fitted regression line, annotated with
# the linear correlation coefficient.
ggplot_lin_reg <- function(df, varx, vary){
  rho <- round(cor(df[[varx]], df[[vary]]), 2)
  # Annotation position: three quarters of the way along the x range,
  # midpoint of the y range.
  posx <- sum(range(df[[varx]]) * c(.25, .75))
  posy <- sum(range(df[[vary]]) * c(.5, .5))
  df %>%
    ggplot() +
    aes(x = .data[[varx]], y = .data[[vary]]) +
    geom_point(alpha = .5, size = .5) +
    geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
    annotate(geom = "text", x = posx, y = posy,
             label = glue("Linear Cor. Coeff.: {rho}")) +
    ggtitle(glue("Linear regression: {vary} ~ {varx}"))
}
ggplot_lin_reg(bank, "AGE", "SAL_ACTUEL")
Inspect rows with high Cook’s distance
# Rows whose Cook's distance exceeds twice the mean Cook's distance.
lm_1_aug %>%
  filter(.cooksd > 2 * mean(.cooksd)) %>%
  select(-starts_with(".")) %>%
  DT::datatable()
SAL_ACTUEL and AGE
# Pairwise correlations between the numeric columns (lower triangle only).
bank %>%
  # select(-id) %>%
  select(where(is.numeric)) %>%
  corrr::correlate() %>%
  corrr::shave() %>%
  corrr::rplot()
Correlation computed with
• Method: 'pearson'
• Missing treated using: 'pairwise.complete.obs'
SAL_ACTUEL as a function of age AGE
To perform linear fitting, we choose \(450\) points amongst the \(474\) sample points of the banque dataframe: the \(24\) remaining points are used to assess the merits of the linear fit.
Function sample() from base R is convenient. You may also enjoy slice_sample() from dplyr. Denote by trainset the vector of selected indices. Bind the vector of left-behind indices to variable testset. Functions match(), setdiff() or operator %in% may be useful.
# Reproducible train/test split: 450 training rows, 24 held out.
old_seed <- set.seed(42)
trainset_size <- 450
trainset <- sample(seq(nrow(bank)), trainset_size)
testset <- setdiff(seq(nrow(bank)), trainset)
trainset <- as.integer(trainset)
testset <- as.integer(testset)
# Fit on the training rows only.
lm_3 <- lm(SAL_ACTUEL ~ AGE, data = bank[trainset, ])
#
lm_3_aug <- lm_3 %>%
  augment(data = bank[trainset, ])
# Training scatter + fit; test points overlaid in red, high-Cook's-distance
# training points in blue.
lm_3 %>%
  augment(data = bank[trainset, ]) %>%
  ggplot() +
  aes(x = AGE, y = SAL_ACTUEL) +
  geom_point(alpha = .5, size = .5) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  # NOTE(review): correlation is computed on the full bank dataframe, not
  # just the training rows — confirm this is intended.
  annotate(geom = "text", x = 60, y = 60000,
           label = str_c("rho = ",
                         round(cor(bank$SAL_ACTUEL, bank$AGE), 2))) +
  ggtitle("Bank dataset",
          subtitle = "Red: test set, Blue: high Cook's distance") +
  geom_point(data = augment(lm_3, newdata = bank[testset, ]),
             color = "red",
             alpha = .85, size = 1) +
  geom_point(data = filter(lm_3_aug, .cooksd > 2 * mean(.cooksd)),
             color = "blue",
             alpha = .85, size = 1)
Inspecting points with high Cook’s distance
# Training rows flagged by Cook's distance, shown without diagnostic columns.
lm_3_aug %>%
  filter(.cooksd > 2 * mean(.cooksd)) %>%
  select(-starts_with(".")) %>%
  DT::datatable()
make_p_diag_1(lm_3) +
  make_p_diag_2(lm_3) +
  make_p_diag_3(lm_3) +
  make_p_diag_5(lm_3)
# Swap the test-set data into the existing diagnostic plots with ggplot2's
# %+% operator.
lm_3_aug_test <- augment(lm_3, newdata = bank[testset, ])
(make_p_diag_1(lm_3) %+% lm_3_aug_test) +
  (make_p_diag_2(lm_3) %+% lm_3_aug_test)
\[ Y = Z \, \beta + \sigma \, \epsilon \]
old_seed <- set.seed(5783)
# lm_1 %>%
#   tidy()
# lm_1 %>% summary
# Build the design matrix from a fitted lm: drop the response (last column
# of the model frame) and prepend an intercept column of ones.
lm2design <- . %$%  # exposing pipe from magrittr
  select(.$model, -ncol(.$model)) %>%
  mutate(ctt = 1) %>%
  select(ctt, everything()) %>%
  as.matrix()  # design matrix
# Fix: in the rendered source this assignment had been swallowed by the
# end-of-line comment above, so sigma_hat was never assigned.
sigma_hat <- sqrt(sum(lm_1$residuals^2) / lm_1$df.residual)
# Simulate a fake response from the fitted model: Z beta_hat + sigma_hat * noise.
sal_actuel_fake <- lm2design(lm_1) %*% lm_1$coefficients +
  sigma_hat * rnorm(nrow(lm_1$model))
# Refit the same regression on the simulated response.
lm_1_fake <- bind_cols(bank,
                       SAL_ACTUEL_FAKE = sal_actuel_fake) %>%
  lm(formula = SAL_ACTUEL_FAKE ~ SAL_EMBAUCHE, data = .)
summary(lm_1_fake)
Call:
lm(formula = SAL_ACTUEL_FAKE ~ SAL_EMBAUCHE, data = .)
Residuals:
Min 1Q Median 3Q Max
-60551 -10758 -1974 7942 101530
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6722.120 1930.957 3.481 0.000545 ***
SAL_EMBAUCHE 3.606 0.103 35.003 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 17630 on 472 degrees of freedom
Multiple R-squared: 0.7219, Adjusted R-squared: 0.7213
F-statistic: 1225 on 1 and 472 DF, p-value: < 2.2e-16
# Diagnostic plots for the model fitted on the simulated response.
make_p_diag_1(lm_1_fake) +
  make_p_diag_2(lm_1_fake) +
  make_p_diag_3(lm_1_fake) +
  make_p_diag_5(lm_1_fake)